package com.xuxueli.crawler.test;

import com.xuxueli.crawler.XxlCrawler;
import com.xuxueli.crawler.annotation.PageFieldSelect;
import com.xuxueli.crawler.annotation.PageSelect;
import com.xuxueli.crawler.conf.XxlCrawlerConf;
import com.xuxueli.crawler.loader.strategy.SeleniumChromePageLoader;
import com.xuxueli.crawler.loader.strategy.SeleniumPhantomjsPageLoader;
import com.xuxueli.crawler.parser.PageParser;
import com.xuxueli.crawler.util.FileUtil;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.util.HashSet;
import java.util.List;
import java.util.Set;

/**
 * Crawler sample 10: load a JavaScript-rendered report page, print the extracted
 * project name and collect the PDF links found on it.
 *
 * <p>The actual PDF download step is currently disabled (kept as commented-out
 * code inside the parser callback).
 *
 * <p>Usage: the first command-line argument, when present, is used as the path to
 * the chromedriver executable; otherwise {@link #DEFAULT_DRIVER_PATH} is used.
 */
public class XxlCrawlerTest10 {
    private static final Logger logger = LoggerFactory.getLogger(XxlCrawlerTest10.class);

    /** Fallback chromedriver location used when no path is passed on the command line. */
    private static final String DEFAULT_DRIVER_PATH =
            "F:\\Downloads\\chromedriver-win64\\chromedriver-win64\\chromedriver.exe";

    /** Report page that this sample crawls. */
    private static final String REPORT_URL =
            "http://pa.ebfire.cn:8094/viewReport/report.html?uqreportid=886d799b-f77b-4b5f-9906-94a9d5b58ec7";

    /**
     * Value object populated by the crawler from the page, scoped to the
     * {@code body} element via {@link PageSelect}.
     */
    @PageSelect(cssQuery = "body")
    public static class PageVo {

        /** Selected from {@code #projectname} with select type HTML. */
        @PageFieldSelect(cssQuery = "#projectname", selectType = XxlCrawlerConf.SelectType.HTML)
        private String projectname;

        /** Selected from the {@code data-url} attribute of {@code img} elements. */
        @PageFieldSelect(cssQuery = "img", selectType = XxlCrawlerConf.SelectType.ATTR, selectVal = "data-url")
        private List<String> pdfs;

        public String getProjectname() {
            return projectname;
        }

        public void setProjectname(String projectname) {
            this.projectname = projectname;
        }

        public List<String> getPdfs() {
            return pdfs;
        }

        public void setPdfs(List<String> pdfs) {
            this.pdfs = pdfs;
        }

        @Override
        public String toString() {
            return "PageVo{" +
                    "projectname='" + projectname + '\'' +
                    ", pdfs=" + pdfs +
                    '}';
        }
    }

    /**
     * Builds the crawler for {@link #REPORT_URL}, runs it and prints progress markers.
     *
     * @param args optional; {@code args[0]} overrides the chromedriver executable path
     */
    public static void main(String[] args) {

        // Allow the machine-specific driver location to be supplied by the caller.
        String driverPath = (args != null && args.length > 0) ? args[0] : DEFAULT_DRIVER_PATH;

        XxlCrawler crawler = new XxlCrawler.Builder()
                .setUrls(REPORT_URL)
                .setAllowSpread(false)
                // Selenium + headless Chrome page loader: supports JS-rendered pages
                .setPageLoader(new SeleniumChromePageLoader(driverPath))
                .setPageParser(new PageParser<PageVo>() {
                    @Override
                    public void parse(Document html, Element pageVoElement, PageVo pageVo) {
                        System.out.println(pageVo);

                        // Look up the DOM node by CSS selector; first() yields null when nothing matches.
                        Element projectNameElement = html.select("#projectname").first();
                        if (projectNameElement != null) {
                            System.out.println("html："+ projectNameElement.html());
                            System.out.println("纯文本："+ projectNameElement.text());
                        } else {
                            logger.warn("element '#projectname' not found in the loaded page");
                        }

                        // PDF download is currently disabled. The block below, when re-enabled,
                        // de-duplicates the collected URLs and hands each one to FileUtil.downFile.
                        //
                        // String filePath = "C:\\Users\\ZDK\\Desktop";    // target directory
                        // if (pageVo.getPdfs()!=null && pageVo.getPdfs().size() > 0) {
                        //     Set<String> pdfsSet = new HashSet<>(pageVo.getPdfs());
                        //     for (String pdf: pdfsSet) {
                        //         // download the file
                        //         String fileName = FileUtil.getFileNameByUrl(pdf, null);
                        //         boolean ret = FileUtil.downFile(pdf, XxlCrawlerConf.TIMEOUT_MILLIS_DEFAULT, filePath, fileName);
                        //         System.out.println("down pdfs " + (ret?"success":"fail") + "：" + pdf);
                        //     }
                        //     System.out.println("down pdfs " + html.html());
                        //     System.out.println("down pdfs " + (pageVo.getPdfs().size()) + "：" + pageVo.getPdfs().get(0));
                        // }
                    }
                })
                .build();

        System.out.println("start");
        // NOTE(review): the boolean presumably selects synchronous execution — confirm against XxlCrawler.start.
        crawler.start(true);
        System.out.println("end");
    }

}
